Remove Outlier Samples
- A distance matrix is used to remove the replicate most unlike the
others if the coefficient of variation is greater than 10%
slope.new <- respiration %>%
separate(Sample_Name, c("ECA", "kit", "rep"), sep = "_", remove = FALSE) %>%
mutate(Treat = case_when(grepl("W",rep)~"Wet",
grepl("D", rep) ~"Dry")) %>%
unite(kit_treat, kit, Treat, remove = FALSE) %>%
group_by(kit_treat) %>%
mutate(Mean_Slope_All = mean(slope_of_the_regression)) %>%
mutate(cv_before_removal = (abs(sd(slope_of_the_regression)/mean(slope_of_the_regression)))*100) %>%
ungroup() %>%
dplyr::select(c(Sample_Name, kit_treat, kit,rep,slope_of_the_regression,rate_mg_per_L_per_min,rate_mg_per_L_per_h, Treat, Mean_Slope_All, cv_before_removal))
slope.new$flag <- NA
slope.final <- as.data.frame(matrix(NA, ncol = 13, nrow =1))
colnames(slope.final) = c("slope.temp","Sample_Name", "kit_treat", "kit", "rep", "rate_mg_per_L_per_min","rate_mg_per_L_per_h", "Treat", "Mean_Slope_All","cv_before_removal", "cv_after_removal", "Mean_Slope_Removed","flag")
unique.samples = unique(slope.new$kit_treat)
for (i in 1:length(unique.samples)) {
data_subset = subset(slope.new, slope.new$kit_treat == unique.samples[i])
slope.temp = as.numeric(data_subset$slope_of_the_regression)
slope.temp.sd <- sd(slope.temp)
slope.temp.mean <- mean(slope.temp)
CV = abs((slope.temp.sd/slope.temp.mean)*100)
#looping to get 4 out of 5 best samples
for (sample.reduction in 1:5) {
if (slope.temp.mean == 0) {
CV = 0
}
else if (length(slope.temp) > 4 & CV >= 10) {
dist.temp = as.matrix(abs(dist(slope.temp)))
dist.comp = numeric()
for(slope.now in 1:ncol(dist.temp)) {
dist.comp = rbind(dist.comp,c(slope.now,sum(dist.temp[,slope.now])))
}
dist.comp[,2] = as.numeric(dist.comp[,2])
slope.temp = slope.temp[-which.max(dist.comp[,2])]
slope.temp.sd <- sd(slope.temp)
slope.temp.mean <- mean(slope.temp)
slope.temp.cv <- abs((slope.temp.sd/slope.temp.mean)*100)
CV = slope.temp.cv
slope.temp.range <- max(slope.temp) - min(slope.temp)
range = slope.temp.range
}
}
if (length(slope.temp) >= 3) {
slope.combined <- as.data.frame(slope.temp)
slope.removed <- merge(slope.combined, data_subset, by.x = "slope.temp", by.y = "slope_of_the_regression", all.x = TRUE)
slope.removed <- slope.removed[!duplicated(slope.removed$Sample_Name), ]
slope.removed$cv_after_removal = as.numeric(abs((sd(slope.temp)/mean(slope.temp))*100))
slope.removed$Mean_Slope_Removed = as.numeric(mean(slope.temp))
}
slope.final = rbind(slope.removed, slope.final)
rm('slope.temp')
}
## This data frame has removed samples
slope.final$flag <- ifelse(slope.final$cv_before_removal < slope.final$cv_after_removal, "Issue in dropping sample", NA)
slope.final <- rename(slope.final, "slope_of_the_regression" = "slope.temp")
slope.final$rem <- abs(slope.final$slope_of_the_regression) - slope.final$rate_mg_per_L_per_min
slope.final <- slope.final[!is.na(slope.final$Sample_Name),]
Calculate Effect Size/Make Histograms of data with no removals
data_clean <- slope.new %>%
mutate(slope_of_the_regression = if_else(slope_of_the_regression>0,0,slope_of_the_regression)) %>%
mutate(rate_mg = if_else(slope_of_the_regression>=0,0,rate_mg_per_L_per_min)) %>%
mutate(Mean_Slope_All = if_else(Mean_Slope_All >0,0,Mean_Slope_All)) %>%
mutate(Mean_Slope = abs(Mean_Slope_All))
wet_no <- ggplot(subset(data_clean, Treat %in% "Wet"), aes(x = rate_mg_per_L_per_min)) +
geom_histogram()+
ggtitle("Wet Rates, No Removals")+
theme(strip.text = element_text(
size = 4))
dry_no <- ggplot(subset(data_clean, Treat %in% "Dry"), aes(x = rate_mg_per_L_per_min)) +
geom_histogram()+
ggtitle("Dry Rates, No Removals")+
theme(strip.text = element_text(
size = 4))
data_clean_effect <- data_clean %>%
filter(!grepl(011, kit)) %>%
filter(!grepl(012, kit)) %>%
distinct(kit_treat, .keep_all = TRUE) %>%
group_by(kit) %>%
mutate(effect = (Mean_Slope[Treat == "Wet"] - Mean_Slope[Treat == "Dry"])) %>%
mutate(log_effect = log10(abs(effect+1))) %>%
distinct(kit, .keep_all = TRUE)
effect_no <- ggplot(data_clean_effect, aes(x = effect)) +
geom_histogram() +
ggtitle("Effect Size, No Removals")+
theme(strip.text = element_text(
size = 4))
data_clean_removals = slope.final %>%
mutate(slope_of_the_regression = if_else(slope_of_the_regression>0,0,slope_of_the_regression)) %>%
mutate(rate_mg = if_else(slope_of_the_regression>=0,0,rate_mg_per_L_per_min)) %>%
mutate(Mean_Slope_Removed = if_else(Mean_Slope_Removed>0,0,Mean_Slope_Removed)) %>%
dplyr::select(c(Sample_Name,rate_mg, Mean_Slope_Removed)) %>%
rename(rate_mg_per_L_per_min = rate_mg) %>%
mutate(rate_mg_per_L_per_h = rate_mg_per_L_per_min*60) %>%
separate(Sample_Name, into = c("EC", "kit", "INC"), remove = FALSE) %>%
separate(Sample_Name, into = c("ID", "rep"), sep = "-", remove = FALSE) %>%
mutate(Treat = case_when(grepl("W",rep)~"Wet",
grepl("D", rep) ~"Dry")) %>%
unite(kit_treat, c("kit", "Treat"), sep = "_", remove = FALSE)
## Warning: Expected 3 pieces. Additional pieces discarded in 452 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
wet_rem <- ggplot(subset(data_clean_removals, Treat %in% "Wet"), aes(x = rate_mg_per_L_per_min)) +
geom_histogram()+
ggtitle("Wet Rates, Removals")+
theme(strip.text = element_text(
size = 4))
dry_rem <- ggplot(subset(data_clean_removals, Treat %in% "Dry"), aes(x = rate_mg_per_L_per_min)) +
geom_histogram()+
ggtitle("Dry Rates, Removals")+
theme(strip.text = element_text(
size = 4))
data_clean_rem_effect <- data_clean_removals %>%
filter(!grepl(011, kit)) %>%
filter(!grepl(012, kit)) %>%
mutate(Mean_Slope = abs(Mean_Slope_Removed)) %>%
distinct(kit_treat, .keep_all = TRUE) %>%
group_by(kit) %>%
mutate(effect = (Mean_Slope[Treat == "Wet"] - Mean_Slope[Treat == "Dry"])) %>%
mutate(log_effect = log10(abs(effect+1))) %>%
distinct(kit, .keep_all = TRUE)
effect_rem <- ggplot(data_clean_rem_effect, aes(x = effect)) +
geom_histogram() +
ggtitle("Effect Size, No Removals")+
theme(strip.text = element_text(
size = 4))
all <- (wet_no + wet_rem + dry_no + dry_rem + effect_no + effect_rem) +
plot_layout(widths = c(3,3))
print(all)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

effect.path <- paste0("C:/Users/",pnnl.user,"/PNNL/Core Richland and Sequim Lab-Field Team - Documents/Data Generation and Files/ECA/Optode multi reactor/Optode_multi_reactor_incubation/effect size/Effect_Size_Data/")
#write.csv(data_clean_rem_effect, paste0(effect.path,"Effect_Size_merged_by_", pnnl.user,"_on_",Sys.Date(),".csv"), row.names = F)
respiration.removals.path <- paste0("C:/Users/",pnnl.user,"/PNNL/Core Richland and Sequim Lab-Field Team - Documents/Data Generation and Files/ECA/Optode multi reactor/Optode_multi_reactor_incubation/rates/Plots/All_Respiration_Rates/")
#write.csv(data_clean_removals, paste0(respiration.removals.path,"removed_respiration_merged_by_", pnnl.user,"_on_",Sys.Date(),".csv"), row.names = F)